---
title: "Indonesia Islamist Violence Project"
subtitle: "Generating Ethnic Fractionalization (Village-Level) Data"
author: "Gareth Nellis"
date: "1/25/2019"
output: html_document
---

```{r}
# Remove every visible object from the current environment before running.
# NOTE(review): this does not detach packages or reset options, so it is not
# a substitute for a fresh R session — confirm this is acceptable.
rm(list = ls())
# dplyr / readr / stringr verbs used throughout (select, mutate, read_csv, str_sub, ...)
library(tidyverse)
# read_dta() for the Stata (.dta) inputs
library(haven)
# NOTE(review): no function from `foreign` appears to be called in this file — confirm before removing
library(foreign)
# supplies the %<>% assignment pipe used in the later chunks
library(magrittr)
```

```{r}
# Raw inputs, grouped by file format.
# Stata files: the APSR data and the 2000 census village population list
apsr <- read_dta("../00-data/raw/tso_APSR_2018_rep2.dta")
village00 <- read_dta("../00-data/raw/2000desapop.dta")

# CSV files: the two PODES extracts (joined on a village code further down)
podes1 <- read_csv("../00-data/raw/pod2000_01.csv")
podes2 <- read_csv("../00-data/raw/pod2000_02.csv")
```

# Cleaning of the data objects

```{r}
# vector of variable names: the dvs we want to keep
  tokeep <- c("ethfractvil", "relfractvil", "ethseg_d", "relseg_d")

# clean apsr data: drop exact duplicate rows, keep the merge covariates plus
# the dvs, sort, and tag every remaining row with a running id (apsr_id) that
# is used further down to check that no APSR row ends up matched twice.
# all_of() is used because `tokeep` is an external character vector; passing
# it bare to select() is deprecated in tidyselect.
  apsr_c <- apsr %>%
    distinct() %>%
    select(kabid, popv, propid, area, perc_poorHH, health_a_distance,
           all_of(tokeep)) %>%
    arrange(kabid, desc(popv)) %>%
    mutate(apsr_id = row_number())

# clean 2000 population census village list: build the village code (descode)
# and the district id (kabid) from the component codes.
# NOTE(review): paste0() does no zero-padding, so the codes are only
# unambiguous if the components are already fixed-width — confirm.
  village00_c <- village00 %>%
    mutate(descode = paste0(prop, kab, kec, desa),
           kabid = as.numeric(paste0(prop, kab))) %>%
    rename(popv = pop) %>%
    arrange(kabid, desc(popv))

# clean podes 1: only B4AR2B (denominator of perc_poorHH below) is needed
  podes1_c <- podes1 %>%
    mutate(descode = paste0(PROP, KAB, KEC, DESA)) %>%
    select(B4AR2B, descode)

# clean podes 2: raw inputs for the area / poverty / health-distance covariates
  podes2_c <- podes2 %>%
    mutate(descode = paste0(PROP, KAB, KEC, DESA),
           kabid = as.numeric(paste0(PROP, KAB))) %>%
    select(B8R5, B10A, descode, kabid, B8R1A3, B8R1A2)

# join podes variables [perfect merge] and rebuild the three covariates that
# are also present in the APSR data, so they can serve as a composite merge key
  podes_c <- podes1_c %>%
    left_join(podes2_c, by = "descode") %>%
    mutate(perc_poorHH = B8R5 / B4AR2B,
           area = B10A / 1000,
           # NOTE(review): `merge` is not referenced again in this file
           merge = 1,
           # B8R1A3 where B8R1A2 equals 0, otherwise 0
           health_a_distance = ifelse(B8R1A2 == 0, B8R1A3, 0),
           # id reflects the row order of the join, i.e. before the sort below
           podes_id = row_number()) %>%
    arrange(kabid, desc(area))

# make the merging variables: round both sides to the same precision and
# convert to character so the join compares exact strings rather than floats
  apsr_c <- apsr_c %>%
    mutate(
      area = as.character(round(area, 3)),
      perc_poorHH = as.character(round(perc_poorHH, 5)),
      health_a_distance = as.character(round(health_a_distance, 1)))

  podes_c <- podes_c %>%
    mutate(
      area = as.character(round(area, 3)),
      perc_poorHH = as.character(round(perc_poorHH, 5)),
      health_a_distance = as.character(round(health_a_distance, 1)))
```


# Split the data according to whether or not PODES data are attached

```{r}
# Partition the APSR rows on whether all three PODES-derived covariates are
# missing; the resulting list has elements named "FALSE" and "TRUE".
  apsr_c_pnp <- split(
    apsr_c,
    is.na(apsr_c$area) &
      is.na(apsr_c$perc_poorHH) &
      is.na(apsr_c$health_a_distance)
  )

# rows with at least one PODES covariate present
  apsr_c_p <- apsr_c_pnp[["FALSE"]]

# rows with none of the PODES covariates
  apsr_c_np <- apsr_c_pnp[["TRUE"]]
```


# Merge the part of the APSR paper that has PODES data attached  
  
```{r} 
# keep only the rows that are unique according to podes variables
# (i.e. remove rows with ANY duplicates, not just the second occurrence)
  apsr_c_p_nd <- apsr_c_p %>%
    group_by(kabid, area, perc_poorHH, health_a_distance) %>%
    filter(n() == 1) %>%
    ungroup()

# ensure podes data have no duplicates on the same key, so the join below
# cannot fan out
  podes_c_nd <- podes_c %>%
    group_by(kabid, area, perc_poorHH, health_a_distance) %>%
    filter(n() == 1) %>%
    ungroup()

# perform the merge on district id + the three rounded covariates
  merged_w_p <- apsr_c_p_nd %>%
    left_join(podes_c_nd,
              by = c("kabid", "area", "perc_poorHH", "health_a_distance"))

# store the subset not merged (no village code picked up from podes)
  merged_w_p_fail <- merged_w_p %>%
    filter(is.na(descode))

# store the subset merged
  merged_w_p_success <- merged_w_p %>%
    filter(!is.na(descode))

# assess whether kabid and popv are uniquely identifying in the subset not merged [they are not]
  table(duplicated(merged_w_p_fail[, c("kabid", "popv")]))

# fallback for the failed rows: match on (kabid, popv) against the census.
# keep only non duplicated elements in the failed merge rows; all_of() is
# used because `tokeep` is an external character vector.
# popv is converted to character on both sides so the join keys share a type.
  merged_w_p_fail_nd <- merged_w_p_fail %>%
    select(kabid, popv, apsr_id, all_of(tokeep)) %>%
    group_by(kabid, popv) %>%
    filter(n() == 1) %>%
    ungroup() %>%
    mutate(popv = as.character(popv))

# create a version of the census that has no duplicates according to kabid and popv
  village00_c_nd <- village00_c %>%
    group_by(kabid, popv) %>%
    filter(n() == 1) %>%
    ungroup() %>%
    mutate(popv = as.character(popv))

# merge the no duplicates with the census [also no duplicates]; rows with a
# missing `prop` after the join found no census match and are dropped
  merged_w_p_fail_nd_c_success <- merged_w_p_fail_nd %>%
    left_join(village00_c_nd, by = c("kabid", "popv")) %>%
    filter(!is.na(prop))
```

# Merge the part of the APSR paper that does not have PODES data attached  

```{r}
# census side: make sure popv is character so it matches the APSR join key
# [some attributes issue]
  village00_c_nd <- mutate(village00_c_nd, popv = as.character(popv))

# APSR rows without PODES data: drop every (kabid, popv) combination that
# occurs more than once, then convert popv to character for the join
  apsr_c_np_nd <- apsr_c_np %>%
    group_by(kabid, popv) %>%
    filter(n() == 1) %>%
    ungroup()
  apsr_c_np_nd <- mutate(apsr_c_np_nd, popv = as.character(popv))

# match against the de-duplicated census; rows whose `prop` is missing after
# the join found no census village and are discarded
  merged_c_success <- left_join(
    apsr_c_np_nd,
    village00_c_nd,
    by = c("kabid", "popv")
  )
  merged_c_success <- filter(merged_c_success, !is.na(prop))
```

# Append the successful merges

```{r}
# clean: reduce each successful merge to the same columns and types so the
# three pieces can be stacked. all_of() is used because `tokeep` is an
# external character vector; select() is namespace-qualified throughout so a
# masking select() from another attached package cannot be picked up.
# NOTE(review): as.numeric(descode) drops any leading zeros in the village
# code — confirm the codes never start with 0.
  merged_w_p_success %<>%
    dplyr::select(apsr_id, descode, all_of(tokeep), popv) %>%
    mutate(descode = as.numeric(descode),
           popv = as.numeric(popv))

  merged_w_p_fail_nd_c_success %<>%
    dplyr::select(apsr_id, descode, all_of(tokeep), popv) %>%
    mutate(descode = as.numeric(descode),
           popv = as.numeric(popv))

  merged_c_success %<>%
    dplyr::select(apsr_id, descode, all_of(tokeep), popv) %>%
    mutate(descode = as.numeric(descode),
           popv = as.numeric(popv))

# merge the successes
  bound <- bind_rows(
    merged_w_p_success,
    merged_w_p_fail_nd_c_success,
    merged_c_success)

# check for unique apsr ids (any TRUE means an APSR row was matched twice)
  table(duplicated(bound$apsr_id))

# what percent did we get? (share of cleaned APSR rows that were matched)
  nrow(bound) / nrow(apsr_c)
```

# Collapse data to kecamatan, taking weighted average by population

```{r}
# collapse by kecamatan code; **note that we're imputing district means for the segregation data**
# kec is the village code with its last three characters removed.
# NOTE(review): descode is numeric here and is coerced to a string by
# str_sub(); confirm no code is rendered in scientific notation.
# Each measure is averaged within kecamatan using village population as the
# weight. na.rm = TRUE drops missing values of the measure only; a missing
# weight (popv) within a group still yields NA for that group's mean.
  final <- bound %>%
    mutate(kec = str_sub(descode, end = -4)) %>%
    group_by(kec) %>%
    summarise(kec_ethfractvil = weighted.mean(ethfractvil, popv, na.rm = TRUE),
              kec_relfractvil = weighted.mean(relfractvil, popv, na.rm = TRUE),
              kec_ethseg_d = weighted.mean(ethseg_d, popv, na.rm = TRUE),
              kec_relseg_d = weighted.mean(relseg_d, popv, na.rm = TRUE),
              kec_pop_total = sum(popv, na.rm = TRUE))
```


# Tidy and save final dataset

```{r}
# Write the kecamatan-level table to disk (write.csv defaults apply, so row
# names are written as the first column).
write.csv(
  x = final,
  file = "../00-data/final/20190128_eth_rel_kec_data.csv"
)
```







